City College of San Francisco
MATH 108 - Foundations of Data Science
Associated Textbook Sections: 17.4 - 17.6
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
from mpl_toolkits.mplot3d import Axes3D
ckd = Table.read_table('data/ckd.csv')
ckd = ckd.relabeled('Blood Glucose Random', 'Glucose').select('Glucose', 'Hemoglobin', 'White Blood Cell Count', 'Class')
patients = Table.read_table('data/breast-cancer.csv').drop('ID')
def randomize_column(a, sd=0.09):
    """Return the values of a with a little random noise added to each one.

    This "jitters" a column whose values fall on a coarse grid so that
    repeated points no longer sit exactly on top of one another in a
    scatter plot.

    a:  array (or other sequence) of numbers.
    sd: standard deviation of the zero-mean normal noise; defaults to 0.09.

    Returns a new float array with the same shape as a. The result is
    random: it changes from call to call unless NumPy's global random seed
    is fixed.
    """
    values = np.asarray(a, dtype=float)
    # One independent noise draw per element, so the output keeps the
    # shape of the input.
    return values + np.random.normal(0.0, sd, size=values.shape)
# Build a three-column table for plotting: the two cell measurements with
# random noise added by randomize_column, plus the unmodified 'Class' label.
# The list alternates column label, column values.
jittered = Table().with_columns([
'Bland Chromatin (jittered)',
randomize_column(patients.column('Bland Chromatin')),
'Single Epithelial Cell Size (jittered)',
randomize_column(patients.column('Single Epithelial Cell Size')),
'Class',
patients.column('Class')
])
Brittany Wenger, a 17-year-old high school student, won the 2012 Google Science Fair by building a breast cancer classifier with 99% accuracy.
Load the breast-cancer.csv data set, which contains various cell measurements and a Class label of benign (0) or malignant (1). This data was produced manually by medical professionals, who assigned the value of each attribute by visually inspecting images.
# rows = individuals
# columns - attributes of cells
# class is determined by an expert
patients = Table.read_table('data/breast-cancer.csv').drop('ID')
patients.show(5)
| Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class |
|---|---|---|---|---|---|---|---|---|---|
| 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 0 |
| 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 0 |
| 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 0 |
| 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 | 0 |
| 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 | 0 |
... (678 rows omitted)
patients.group('Class')
| Class | count |
|---|---|
| 0 | 444 |
| 1 | 239 |
Visualize the relationship between Bland Chromatin and Single Epithelial Cell Size in order to try and classify the cell. Notice that it doesn't seem like there are hundreds of data points in the scatterplot.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', group='Class')
An issue with the data is that the attributes take only a few distinct numerical values (they are more like categories or rankings), so many data points are repeated and are drawn on top of one another. The jittered table adds a small amount of random noise to each data point's values.
# jittering adds a little noise and allows us to see the numbers of points at each grid point.
jittered.scatter(0, 1, group='Class')
Each row contains all the data for one individual
- `t.row(i)` evaluates to the `i`th row of table `t`
- `t.row(i).item(j)` is the value of column `j` in row `i`
- `np.array(t.row(i))` evaluates to an array of all the numbers in the row
- To work with every row in turn, use `for row in t.rows:` followed by a body that uses `... row.item(j) ...`
- `t.exclude(i)` evaluates to the table `t` without its `i`th row

For a right triangle with legs $a, b$ and hypotenuse $c$, the following relationship is always true: $$a^2 + b^2 = c^2.$$
One way to calculate the distance between two points utilizes a right triangle and depends on the number of attributes each point has.
Create a function to calculate the distance between 2 points (represented as arrays). Use that function to calculate the distance between two rows of numeric data from a table.
# use array arithmetic it will do term by term subtraction
def distance(pt1, pt2):
    """Return the distance between two points, represented as arrays.

    This is the straight-line (Euclidean) distance: the square root of the
    sum of the squared differences between corresponding coordinates.

    pt1, pt2: arrays (or lists/tuples) of numbers with the same length.
    """
    # Array arithmetic subtracts term by term. Converting to float arrays
    # first lets plain lists and tuples work too, and keeps the subtraction
    # from wrapping around on unsigned or small integer dtypes.
    diff = np.asarray(pt2, dtype=float) - np.asarray(pt1, dtype=float)
    # np.sum adds the squares inside NumPy instead of looping in Python.
    return np.sqrt(np.sum(diff ** 2))
def row_distance(row1, row2):
    """Return the distance between two rows of numbers taken from a table.

    Each row is first turned into an array so that distance() can subtract
    the two rows term by term.
    """
    first = np.array(row1)
    second = np.array(row2)
    return distance(first, second)
Clean up the patients data to be able to apply the row_distance function to it.
# Don't want class column in the data set.
attributes = patients.drop('Class')
attributes.show(3)
| Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses |
|---|---|---|---|---|---|---|---|---|
| 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 |
| 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 |
| 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 |
... (680 rows omitted)
row_distance(attributes.row(0), attributes.row(1))
11.874342087037917
row_distance(attributes.row(0), attributes.row(2))
2.2360679774997898
row_distance(attributes.row(2), attributes.row(2))
0.0
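k Nearest Neighbors

To find the k nearest neighbors of an example:

1. Find the distance between the example and each row of the training set.
2. Augment the training table with a column containing those distances.
3. Sort the augmented table in increasing order of distance.
4. Take the top k rows of the sorted table.

To classify a point:

1. Find its k nearest neighbors.
2. Take a majority vote of the k nearest neighbors to see which of the two classes appears more often.
3. Assign the point the class that wins the majority vote.

The following function calculates the distance between an example row (an individual patient) and every row in a training set of patient data.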
# takes in training set and a row, computes the distance the row is from each member of the
# training set and then attaches those distances to the training table
def distances(training, example):
    """
    Compute distance between example and every row in training.

    training: a table that includes a 'Class' column.
    example:  a row (or array) of attribute values, in the same order as
              the non-'Class' columns of training.

    Return training augmented with a 'Distance_to_ex' column.
    """
    # The class label is not an attribute, so leave it out of the distance.
    attributes_only = training.drop('Class')
    # Compute every distance in one pass. This visits the same rows as
    #     for i in np.arange(attributes_only.num_rows):
    #         row = attributes_only.row(i)
    # but avoids growing an array with np.append, which re-copies the whole
    # array on each iteration.
    dists = np.array(
        [row_distance(row, example) for row in attributes_only.rows],
        dtype=float,
    )
    return training.with_column('Distance_to_ex', dists)
Measure the distance between row 21 and every other row in the data set.
example = attributes.row(21)
example
Row(Clump Thickness=10, Uniformity of Cell Size=5, Uniformity of Cell Shape=5, Marginal Adhesion=3, Single Epithelial Cell Size=6, Bare Nuclei=7, Bland Chromatin=7, Normal Nucleoli=10, Mitoses=1)
# Exclude 21 from the calculation, find all distances to 21 and then sort.
distances(patients.exclude(21), example).sort('Distance_to_ex')
| Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class | Distance_to_ex |
|---|---|---|---|---|---|---|---|---|---|---|
| 8 | 4 | 4 | 5 | 4 | 7 | 7 | 8 | 2 | 0 | 4.3589 |
| 10 | 5 | 7 | 4 | 4 | 10 | 8 | 9 | 1 | 1 | 4.47214 |
| 7 | 4 | 4 | 3 | 4 | 10 | 6 | 9 | 1 | 1 | 5.09902 |
| 10 | 3 | 6 | 2 | 3 | 5 | 4 | 10 | 2 | 1 | 5.38516 |
| 10 | 5 | 5 | 6 | 3 | 10 | 7 | 9 | 2 | 1 | 5.38516 |
| 10 | 6 | 6 | 2 | 4 | 10 | 9 | 7 | 1 | 1 | 5.38516 |
| 9 | 7 | 7 | 5 | 5 | 10 | 7 | 8 | 3 | 1 | 5.56776 |
| 10 | 6 | 4 | 3 | 10 | 10 | 9 | 10 | 1 | 1 | 5.56776 |
| 8 | 7 | 4 | 4 | 5 | 3 | 5 | 10 | 1 | 1 | 5.56776 |
| 10 | 7 | 7 | 4 | 5 | 10 | 5 | 7 | 2 | 1 | 5.74456 |
... (672 rows omitted)
Create a function that finds the k rows closest to the example row. Apply that function to the situation above.
def closest(training, example, k):
    """
    Return a table of the k rows of training that are nearest to example.

    The rows come back in increasing order of 'Distance_to_ex'.
    """
    with_dists = distances(training, example)
    nearest_first = with_dists.sort('Distance_to_ex')
    # Rows 0 .. k-1 of the sorted table are the k smallest distances.
    return nearest_first.take(np.arange(k))
# In training set finding the 5 nearest neighbors to patient 21.
closest(patients.exclude(21), example, 5)
| Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class | Distance_to_ex |
|---|---|---|---|---|---|---|---|---|---|---|
| 8 | 4 | 4 | 5 | 4 | 7 | 7 | 8 | 2 | 0 | 4.3589 |
| 10 | 5 | 7 | 4 | 4 | 10 | 8 | 9 | 1 | 1 | 4.47214 |
| 7 | 4 | 4 | 3 | 4 | 10 | 6 | 9 | 1 | 1 | 5.09902 |
| 10 | 3 | 6 | 2 | 3 | 5 | 4 | 10 | 2 | 1 | 5.38516 |
| 10 | 5 | 5 | 6 | 3 | 10 | 7 | 9 | 2 | 1 | 5.38516 |
Create a function or functions to report the majority class for the nearest k rows to the example row.
# What will the prediction be? Tally the results in the k nearest neighbors.
closest(patients.exclude(21), example, 5).group('Class').sort('count', descending=True)
| Class | count |
|---|---|
| 1 | 4 |
| 0 | 1 |
def majority_class(topk):
    """
    Return the value of 'Class' that occurs most often in topk.

    topk is a table with a 'Class' column, e.g. the output of closest().
    """
    # One row per class, with a 'count' column giving how often it appears.
    tally = topk.group('Class')
    most_common_first = tally.sort('count', descending=True)
    # Column 0 of the grouped table holds the class labels; its first
    # entry is the class in the top row after sorting.
    labels = most_common_first.column(0)
    return labels.item(0)
def classify(training, example, k):
    """
    Predict the class of example by a vote of its k nearest neighbors.

    Finds the k rows of training closest to example and returns the class
    that majority_class() picks among them.
    """
    neighbors = closest(training, example, k)
    return majority_class(neighbors)
# The prediction for patient 21.
classify(patients.exclude(21), example, 5)
1
# Check if the prediction is correct.
patients.take(21)
| Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class |
|---|---|---|---|---|---|---|---|---|---|
| 10 | 5 | 5 | 3 | 6 | 7 | 7 | 10 | 1 | 1 |
Apply this technique to other example rows from the data set.
new_example = attributes.row(10)
classify(patients.exclude(10), new_example, 5)
0
patients.take(10)
| Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | 1 | 1 | 1 | 3 | 1 | 1 | 0 |
another_example = attributes.row(15)
classify(patients.exclude(15), another_example, 5)
0
patients.take(15)
| Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class |
|---|---|---|---|---|---|---|---|---|---|
| 7 | 4 | 6 | 4 | 6 | 1 | 4 | 3 | 1 | 1 |
- `distance(pt1, pt2)`: Returns the distance between the arrays `pt1` and `pt2`
- `row_distance(row1, row2)`: Returns the distance between the rows `row1` and `row2`
- `distances(training, example)`: Returns a table that is `training` with an additional column `'Distance_to_ex'` that contains the distance between `example` and each row of `training`
- `closest(training, example, k)`: Returns a table of the rows corresponding to the `k` smallest distances
- `majority_class(topk)`: Returns the majority class in the `'Class'` column
- `classify(training, example, k)`: Returns the predicted class of `example` based on a `k` nearest neighbors classifier using the historical sample `training`

Split up the patients data into two tables where approximately 80% of the data is used for training and 20% is used for testing.
round(patients.num_rows * 0.8)
546
# Forming training and test sets. 80%/20% split.
shuffled = patients.sample(with_replacement=False) # Randomly permute the rows
training_set = shuffled.take(np.arange(round(patients.num_rows * 0.8)))
test_set = shuffled.take(np.arange(round(patients.num_rows * 0.8), patients.num_rows))
Create a function that evaluates the accuracy by returning the proportion of correctly classified examples in the test set.
# Accuracy is the proportion of those that were predicted correctly.
def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples
    in the test set.

    training: table of labeled rows used to find nearest neighbors.
    test:     table of labeled rows to classify; must include 'Class'.
    k:        number of nearest neighbors that vote on each prediction.

    An empty test table is not handled: the final division would be by
    zero.
    """
    test_attributes = test.drop('Class')
    # Read the true labels once instead of re-fetching the column on every
    # pass through the loop.
    actual = test.column('Class')
    num_correct = 0
    for i in np.arange(test.num_rows):
        predicted = classify(training, test_attributes.row(i), k)
        # A correct prediction compares equal (True), which adds 1.
        num_correct = num_correct + (predicted == actual.item(i))
    return num_correct / test.num_rows
# Accuracy for 5 nearest neighbors.
evaluate_accuracy(training_set, test_set, 5)
0.9781021897810219
# Accuracy for 3 nearest neighbors.
evaluate_accuracy(training_set, test_set, 3)
0.9708029197080292
evaluate_accuracy(training_set, test_set, 11)
0.9635036496350365
evaluate_accuracy(training_set, test_set, 1)
0.948905109489051
Table().with_columns(
'Glucose', make_array(117, 70, 380, 157),
'Hemoglobin', make_array(11.2, 9.5, 10.8, 5.6),
'White Blood Cell Count', make_array(6700, 12100, 4500, 11000),
'Class', make_array(1, 1, 1, 1)
)
| Glucose | Hemoglobin | White Blood Cell Count | Class |
|---|---|---|---|
| 117 | 11.2 | 6700 | 1 |
| 70 | 9.5 | 12100 | 1 |
| 380 | 10.8 | 4500 | 1 |
| 157 | 5.6 | 11000 | 1 |
Explore what can happen to the classifier's accuracy when the units are not standardized.
def standard_units(x):
    """Return the values of x converted to standard units.

    Each value is replaced by the number of standard deviations it lies
    above (positive) or below (negative) the mean of x, so the result has
    mean 0 and standard deviation 1.

    x: array (or other sequence) of numbers.

    np.std is called with its defaults, i.e. the population standard
    deviation. Division by zero is not guarded: if every value in x is the
    same, the standard deviation is 0 and the result is not meaningful.
    """
    values = np.asarray(x, dtype=float)
    return (values - np.mean(values)) / np.std(values)
ckd_new = ckd.select('Class').with_columns(
'Glucose_su', standard_units(ckd.column('Glucose')),
'Hemoglobin_su', standard_units(ckd.column('Hemoglobin')),
'WBC_su', standard_units(ckd.column('White Blood Cell Count'))
)
ckd_new
| Class | Glucose_su | Hemoglobin_su | WBC_su |
|---|---|---|---|
| 1 | -0.221549 | -0.865744 | -0.569768 |
| 1 | -0.947597 | -1.45745 | 1.16268 |
| 1 | 3.84123 | -1.00497 | -1.27558 |
| 1 | 0.396364 | -2.81488 | 0.809777 |
| 1 | 0.643529 | -2.08395 | 0.232293 |
| 1 | -0.561402 | -1.35303 | -0.505603 |
| 1 | 2.04928 | -0.413266 | 0.360623 |
| 1 | -0.947597 | -1.28342 | 3.34429 |
| 1 | 1.87936 | -1.10939 | -0.409356 |
| 1 | 0.489051 | -1.35303 | 1.96475 |
... (148 rows omitted)
# Standardized units: shuffle the rows, then use the first 74 for training
# and the next 74 (rows 74-147) for testing, and measure 3-nearest-neighbor
# accuracy.
# NOTE(review): the ckd_new display shows 158 rows (10 shown + 148 omitted),
# so this hard-coded split leaves the last 10 shuffled rows unused --
# confirm that is intended.
shuffled = ckd_new.sample(with_replacement=False)
training_set = shuffled.take(np.arange(74))
test_set = shuffled.take(np.arange(74, 148))
evaluate_accuracy(training_set, test_set, 3)
0.9594594594594594
# Original (not standardized) units: same shuffle / 74-74 split /
# 3-nearest-neighbor evaluation, but on the raw ckd measurements, for
# comparison with the standardized run.
# NOTE(review): ckd_new is built from ckd and displays 158 rows, so this
# hard-coded split presumably also leaves 10 shuffled rows unused -- confirm.
shuffled = ckd.sample(with_replacement=False)
training_set = shuffled.take(np.arange(74))
test_set = shuffled.take(np.arange(74, 148))
evaluate_accuracy(training_set, test_set, 3)
0.7972972972972973